options(scipen=999, warn = -1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
courses <- as_tibble(read.csv("C:/Users/crane/Documents/MA342/Project/courses.csv"))
institutions <- read.csv("C:/Users/crane/Documents/MA342/Project/Institutions.csv")


#remove special characters in tuition, fees, placement, and wage columns
#forcing  columns as numeric replaces the ^ and - characters present in data set to NA, which is what we want
courses <- courses %>%
  mutate(inst_id = as.numeric(inst_id)) %>%
  mutate(AnnualTuition = as.numeric(str_replace(AnnualTuition, "[$%]", ""))) %>%
  mutate(AnnualFees = as.numeric(str_replace(AnnualFees, "[$%]", ""))) %>%
  mutate(Placement = as.numeric(str_replace(Placement, "[$%]", ""))) %>%
  mutate(MedWageEntry = as.numeric(str_replace(MedWageEntry, "[$%]", ""))) %>%
  mutate(MedWage5yr = as.numeric(str_replace(MedWage5yr, "[$%]", "")))

#add columns for total cost per year
courses <- courses %>%
  mutate(AnnualCost = AnnualTuition + AnnualFees) %>%
  mutate(TotalEstimatedCost = (RequiredHours/30)*AnnualCost) %>%
  mutate(Med5YrValue = MedWage5yr/TotalEstimatedCost) %>%
  mutate(MedEntryValue = MedWageEntry/TotalEstimatedCost)


#__________________________________________________________________________________________________
#plot showing cost of degree vs median entry wage  of all degrees, color by required hours
#these two plots best answer the question, what degree from KHE schools have the best value
#both of these plots are interactive, meaning they show additional information when you
#hover over one of the data points but must be viewed as an html file
plt1 <- courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(TotalEstimatedCost < 75000) %>%
  ggplot() +
  geom_point(mapping = aes(x = TotalEstimatedCost, y = MedWageEntry, color = RequiredHours, text = paste0(DegreeTitle, " ", InstitutionName))) +
  ggtitle("Total Cost versus Median Entry Level Salary")

ggplotly(plt1)
plt2 <- courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(TotalEstimatedCost < 75000) %>%
  ggplot() +
  geom_point(mapping = aes(x = TotalEstimatedCost, y = MedWage5yr, color = RequiredHours, text = paste0(DegreeTitle, " ", InstitutionName))) +
  ggtitle("Total Cost versus Median Salary After 5 years")

ggplotly(plt2)
plt3 <- courses %>%
  left_join(institutions, by = "inst_id") %>%
  ggplot()+
  geom_point(mapping = aes(x = TotalEstimatedCost, y = Placement, color = RequiredHours, text = paste0(DegreeTitle, " ", InstitutionName))) +
  ggtitle("Total Cost vs Placement%")

 ggplotly(plt3)
 #Using the results of these 3 charts, we can make a determination that associates degrees
 #have the highest value in terms of cost vs wage and placement percentages
 
 #its also interesting to note that the points are in 3 distinct groups based on cost
 #1) two year colleges <20000
 #2) Fort Hayes, Pittsburg, Emporia >20000 and < 40000
 #3) KU, KSU, WSU, WU >40000
 
 #obviously an associates degree isn't not for everyone, so below is a graph of just the bachelors degrees
 plt4 <- courses %>%
   left_join(institutions, by = "inst_id") %>%
   filter(RequiredHours >= 120) %>%
   ggplot() +
   geom_point(mapping = aes(x = TotalEstimatedCost, y = MedWage5yr, color = RequiredHours, text = paste0(DegreeTitle, " ", InstitutionName))) +
   ggtitle("Total Cost versus Median Salary After 5 years (Bachelors Degrees)")
 
 ggplotly(plt4)
#___________________________________________________________________________________________________

#plot showing distribution of wages by university
courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(inst_type %in% c("State University", "Municipal University")) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = InstitutionName, y = MedWageEntry)) +
  coord_flip()

#placement by university
courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(inst_type %in% c("State University", "Municipal University")) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = InstitutionName, y = Placement)) +
  coord_flip()

#university by costs
courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(inst_type %in% c("State University", "Municipal University")) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = InstitutionName, y = TotalEstimatedCost)) +
  coord_flip()

#find highest paying degrees
courses %>%
  group_by(DegreeTitle) %>%
  summarize(meanWage = mean(MedWage5yr, na.rm = TRUE)) %>%
  arrange(desc(meanWage))
## # A tibble: 434 × 2
##    DegreeTitle                        meanWage
##    <chr>                                 <dbl>
##  1 BUSINESS ANALYTICS                  107783 
##  2 ELECTRICAL & POWER TRANSMISSION     103802 
##  3 ARCHITECTURAL ENGINEERING           103643 
##  4 ELECTRONICS ENGINEERING TECHNOLOGY  102894 
##  5 COMPUTER ENGINEERING                101775.
##  6 APPLIED COMPUTING                   101604 
##  7 ELECTRIC POWER AND DISTRIBUTION     101415 
##  8 ELECTRICAL ENGINEERING               99578 
##  9 COMPUTER SCIENCE                     95220.
## 10 PETROLEUM ENGINEERING                94728 
## # ℹ 424 more rows
#find which degrees/programs have highest wage vs cost
courses %>%
  group_by(DegreeTitle) %>%
  summarize(meanValue = mean(Med5YrValue, na.rm = TRUE), meanWage = mean(MedWage5yr, na.rm = TRUE), n = n()) %>%
  filter(n > 4) %>%
  arrange(desc(meanValue))
## # A tibble: 39 × 4
##    DegreeTitle                                          meanValue meanWage     n
##    <chr>                                                    <dbl>    <dbl> <int>
##  1 COSMETOLOGY                                               6.07   38076      5
##  2 LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMA…      5.50   43052.    10
##  3 WELDING TECHNOLOGY                                        5.40   51550      7
##  4 AA, AS, AGS DEGREES (TRANSFER DEGREES)                    5.10   43240.    17
##  5 AUTOMOTIVE TECHNOLOGY                                     4.66   56143.    13
##  6 ELECTRICAL TECHNOLOGY                                     4.36   53807.     6
##  7 DIESEL TECHNOLOGY                                         4.24   54065.     5
##  8 PHYSICAL THERAPIST ASSISTANT                              4.12   49155      5
##  9 LIBERAL STUDIES                                           4.05   41600.     9
## 10 SURGICAL TECHNOLOGY                                       3.93   48051.     5
## # ℹ 29 more rows
#compare wages by inst type
courses %>%
  left_join(institutions, by = "inst_id") %>%
  group_by(inst_type) %>%
  filter(!is.na(inst_type)) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = inst_type, y = MedWageEntry)) +
  coord_flip()

courses %>%
  left_join(institutions, by = "inst_id") %>%
  group_by(inst_type) %>%
  filter(!is.na(inst_type)) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = inst_type, y = MedWage5yr)) +
  coord_flip()

#compare only engineering degrees
courses %>%
  left_join(institutions, by = "inst_id") %>%
  filter(str_detect(DegreeTitle, "ENGINEERING")) %>%
  filter(inst_type %in% c("State University", "Municipal University")) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = InstitutionName, y = MedWage5yr)) +
  coord_flip()